data_sak_raw |>
ggplot(aes(Layers)) +
geom_histogram(binwidth = 1)
quantile(data_sak_raw$Layers, na.rm = TRUE) # 1,2,3,4,15
0% 25% 50% 75% 100%
1 2 3 4 15
quantile(data_sak_raw$class, na.rm = TRUE) # 1,2,3,4,15
0% 25% 50% 75% 100%
58 84 91 111 124
data_sak_raw |>
ggplot(aes(Layers, Functions)) +
geom_count(alpha = 0.3)
# select(which(!colSums(data_sak_raw, na.rm=TRUE) %in% 0) | is.character()) # 合計が0, 1 もしくはnrow(), nrow()-1のものはuninformativeなので削除してよいはず
df_sak |>
filter(Price <30000, class < 112) |>
ggplot(aes(x = Price, y = Functions, colour = class, group = class)) +
geom_point() +
geom_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'
Warning in qt((1 - level)/2, df) : NaNs produced
Warning in max(ids, na.rm = TRUE) :
no non-missing arguments to max; returning -Inf
data https://www1.doshisha.ac.jp/~mjin/R/Chap_26/26.html use FactoMineR instead of MASS to get eigen values
df_sak_factor <- df_sak |>
# filter(Price ) |> # remove swissChampXAVT
select(Layer1:`MAT_Scale:BrassWood`) |>
mutate(across(everything(), as.factor))
sak_mca <- MCA(df_sak_factor, ncp = 2, graph = TRUE)
df_sak_mca_coord <- sak_mca$ind$coord |>
as_tibble() |>
transmute(mca_x = `Dim 1`, mca_y = `Dim 2`)
# http://www.sthda.com/english/articles/31-principal-component-methods-in-r-practical-guide/114-mca-multiple-correspondence-analysis-in-r-essentials/
# extract eigen value
sak_corresp_eigen_values <- get_eigenvalue(sak_mca)
# check eigen v
fviz_screeplot(sak_mca, addlabels = TRUE, ylim = c(0, 20))
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# variance explained by dim1 and dim2 are:
# they are later used in the ggplot
sak_corresp_dim1_ev <- sak_corresp_eigen_values[1, 2]
sak_corresp_dim2_ev <- sak_corresp_eigen_values[2, 2]
# coordinates
df_sak_factor2 <- bind_cols(df_sak, df_sak_mca_coord)
cut the trees
https://uc-r.github.io/hc_clustering
df_fviz_wss <- df_sak_corresp |>
fviz_nbclust(FUN = hcut, method = "wss", k.max = 30)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
df_fviz_wss <- df_sak_corresp |>
fviz_nbclust(FUN = hcut, method = "wss", k.max = 30)
df_fviz_wss2 <- df_fviz_wss$data |>
mutate(clusters = as.numeric(clusters))
df_fviz_wss2 |>
ggplot(aes(clusters, y)) +
geom_line() +
# geom_point() +
ylab("Total WSS") +
xlab("No of clusters k") +
theme_classic(base_family = "Fira Code")
SplitsTree4のUncorrectedPがHamming距離らしいので、{cultevo}を使ってHamming距離を算出する。しかし全然違う結果になる・・・なぜ?
sak_dist <- hammingdists(df_sak_factor) # 5sec. sometimes it's finished very fast but it means it's not running...? in that case, restart R
sak_upgma <- sak_dist |>
hclust(method = "average") |> # average = UPGMA
as.phylo()
sak_upgma$tip.label <- df_sak$short_name
sak_upgma |>
plot.phylo(type = "u", use.edge.length = TRUE, lab4ut = "axial")
sak_nj <- sak_dist |> nj()
sak_nj$tip.label <- df_sak$short_name
sak_nj |>
plot.phylo(type = "u", use.edge.length = TRUE, lab4ut = "axial")
df_sak_rowid <- df_sak |>
rowid_to_column(var = "node") |>
mutate(node = node |> as.integer())
df_sak_upgma <- sak_upgma |>
as_tibble() |>
left_join(df_sak_rowid, by = "node") |>
select(parent:class, short_name, Functions, width, weight, id) |>
mutate(short_name = if_else(is.na(short_name), node |> as.character(), short_name)) |>
column_to_rownames(var = "short_name") |>
mutate(node = node |> as.numeric()) |>
select(node, class, id, Functions, weight, width) # temporarily limite
sak_upgma$tip.label <- df_sak$short_name
df_sak_nj <- sak_nj |>
as_tibble() |>
left_join(df_sak_rowid, by = "node") |>
select(parent:class, short_name, Functions, width, weight, id) |>
mutate(short_name = if_else(is.na(short_name), node |> as.character(), short_name)) |>
column_to_rownames(var = "short_name") |>
mutate(node = node |> as.numeric()) |>
select(node, class, id, Functions, weight, width)
phytools fastAnc
# create a named number vector of weights (in gram) of SAKs
sak_trait_weight <- df_sak_upgma |>
select(weight) |>
as.matrix()
# change the matrix into a vector, then cull unwanted nodes after 57
sak_trait_weight_named_num <- sak_trait_weight[, 1][1:nrow(df_sak)]
# now fit the data to nodes using UPGMA tree
sak_fit_upgma_weight <- fastAnc(
sak_upgma,
sak_trait_weight_named_num,
vars = TRUE,
CI = TRUE)
df_sak_upgma$weight[(nrow(df_sak) + 1): nrow(df_sak_upgma)] <- sak_fit_upgma_weight$ace
sak_fit_nj_weight <- fastAnc(
sak_nj,
sak_trait_weight_named_num,
vars = TRUE,
CI = TRUE)
df_sak_nj$weight[(nrow(df_sak) + 1): nrow(df_sak_nj)] <- sak_fit_nj_weight$ace
# join with tree. seems impossible but possible :D
sak_upgma2 <- full_join(sak_upgma, df_sak_upgma, by = "node")
sak_nj2 <- full_join(sak_nj, df_sak_nj, by = "node")
sak_upgma2 |>
ggtree(layout = "ape", aes(colour = weight)) +
geom_tiplab(aes(label = paste0(class, label),), family = "Fira Code", size = 2, hjust = -.15) + # model name (e.g., Explorer)
geom_tiplab(aes(label = weight |> round(0)), family = "Fira Code", size = 1, hjust = 1) + # weight of models
geom_nodelab(aes(label = weight |> round(0)), family = "Fira Code", size = 1, colour = "black") + # estimated weights at nodes
geom_treescale() +
scale_size_area(max_size = 5) +
scale_colour_viridis_c(limits = c(15, 206)) +
coord_cartesian(clip = 'off') +
theme_tree(
plot.margin=margin(40, 80, 40, 60),
) +
theme(
legend.position = c(0.05, .8),
legend.key.size = unit(2, 'mm'),
)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
sak_nj2 |>
ggtree(layout = "ape", aes(colour = weight)) +
geom_tiplab(aes(label = paste0(class, label),), family = "Fira Code", size = 3, hjust = -.3) + # model name (e.g., Explorer)
geom_tiplab(aes(label = weight |> round(0)), family = "Fira Code", size = 2, hjust = 1) + # weight of models
geom_nodelab(aes(label = weight |> round(0)), family = "Fira Code", size = 2, vjust = 1) + # estimated weights at nodes
geom_treescale() +
scale_size_area(max_size = 5) +
scale_colour_viridis_c(limits = c(15, 206)) +
coord_cartesian(clip = 'off') +
theme_tree(
plot.margin=margin(120, 40, 40, 40),
) +
theme(
legend.position = c(0.1, .9),
legend.key.size = unit(2, 'mm'),
)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
sak_nj2 |>
root(outgroup = 45, edgelabel = TRUE) |>
ggtree(aes(colour = weight)) +
geom_tiplab(aes(label = paste0(class, label),), family = "Fira Code", size = 3, hjust = -.3) + # model name (e.g., Explorer)
geom_tiplab(aes(label = weight |> round(0)), family = "Fira Code", size = 2, hjust = 1) + # weight of models
geom_nodelab(aes(label = weight |> round(0)), fill = "white", family = "Fira Code", size = 2) + # estimated weights at nodes
geom_treescale() +
scale_size_area(max_size = 5) +
scale_colour_viridis_c(limits = c(15, 206)) +
# coord_cartesian(clip = 'off') +
theme_tree(
plot.margin=margin(40, 40, 40, 40),
) +
theme(
legend.position = c(0.15, .5),
legend.key.size = unit(2, 'mm'),
)
create phyDat data for categorical phenotype reconstruction
df_sak_factor_with_name <- df_sak_factor |>
mutate(name = df_sak$short_name) |>
relocate(name)
phyDat_sak <-
df_sak_factor_with_name |>
rownames_to_column() |>
pivot_longer(-name, 'variable', 'value') |>
pivot_wider(variable, name) |>
filter(variable != "rowname") |>
mutate(across(everything(), as.character)) |>
column_to_rownames(var = "variable") |>
phyDat(type = "USER", levels = c("0", "1"))
phyDat_sak |> View()
plot isn’t really good. use SplitsTree4 using write.nexus()?
sak_dist2 <- phyDat_sak |> dist.hamming() # must be phyDat
Error in dist.hamming(phyDat_sak) : object 'phyDat_sak' not found
sak_nn |> ggsplitnet(aes(x, y))+
geom_splitnet(size = .1) +
geom_tiplab2(data = df_sak_nn, aes(colour = class), size = 2, family = "Fira Code") +
theme_tree() +
xlim(-.25, .35) +
ylim(-.25,.25)
Error in FUN(X[[i]], ...) : object 'angle' not found
getwd()
df_sak_factor_with_name |>
write_tsv("./data/sak_factor_for_mesquite_nex.tsv")
sak_nj |> write.nexus("data/sak_factor.nex")